package org.luosl.webmagicx.processor

import org.jsoup.Jsoup
import org.luosl.webmagicx.conf.{Props, SpiderConf}
import us.codecraft.webmagic.Page
import us.codecraft.webmagic.selector.{Html, Selectable}

import scala.collection.JavaConverters._
import scala.collection.mutable

/**
  * Created by luosl on 2017/11/6.
  */
class GeneralProcessor(sc: SpiderConf, props:Props) extends BaseProcessor(sc, props){

  /**
    * Decides whether links should be extracted from the page, based on the
    * depth of the current request and the configured maximum depth.
    *
    * A configured maximum depth of -1 disables the limit.
    *
    * NOTE(review): a page whose depth equals the maximum still enqueues links
    * at depth `maxDeep + 1` -- confirm this is the intended meaning of maxDeep.
    *
    * @param page the page being processed
    */
  def linksProcess(page: Page): Unit = {
    val nowDeep: Int = page.getRequest.getDeep
    val maxDeep = sc.attribute.maxDeep
    if (maxDeep == -1 || nowDeep <= maxDeep) {
      extractLinks(page, nowDeep + 1)
    } else {
      logInfo(s"当前深度:$nowDeep,当前指定最大深度${sc.attribute.maxDeep},跳过链接抽取环节...")
    }
  }

  /**
    * Extracts the links matching each configured target-URL rule and adds them
    * to the page as target requests.
    *
    * For every rule the links are taken from the region selected by the rule's
    * xpath, or from the whole document when the rule has no xpath, then
    * filtered by the rule's regex. URL fragments are removed before queuing.
    *
    * @param page the page to extract links from
    * @param deep the depth passed along with the newly added requests
    */
  def extractLinks(page: Page,deep:Int): Unit = {
    val links: Seq[String] = sc.targetUrlRegexs.flatMap { regexAndXpath =>
      val selectable: Selectable =
        regexAndXpath.xpath.fold[Selectable](page.getHtml)(xp => page.getHtml.xpath(xp))
      selectable.links().regex(regexAndXpath.regex).all().asScala.map(withoutFragment)
    }
    page.addTargetRequests(links.asJava, deep)
  }

  /**
    * Processes one downloaded page: records the URL and raw HTML, queues
    * follow-up links, then extracts every configured field.
    *
    * If a mandatory field yields no content the page is marked as skipped and
    * `onSkip` is invoked; otherwise `onSuccess` is invoked once all fields are
    * stored. Any exception raised along the way is logged and reported through
    * `onError`.
    *
    * @param page the page being processed
    */
  override def process(page: Page): Unit = {
    try {
      val currentUrl: String = withoutFragment(page.getRequest.getUrl)
      page.putField("_url", currentUrl)
      page.putField("_page", page.getHtml.toString)
      linksProcess(page)
      sc.fields.foreach { field =>
        // Once a mandatory field has failed, the remaining fields are ignored.
        if (!page.isSkip) {
          // Extract the content for each configured xpath.
          val extracts: Seq[String] = field.xpaths.map { xpath =>
            val textWithTag: String =
              page.getHtml.xpath(xpath).all().asScala.filterNot(_ == null).mkString
            if (field.textFormat) Jsoup.parse(textWithTag).text() else textWithTag
          }
          // Narrow the per-xpath results down according to the configured scope.
          val selected: Option[String] = field.scope match {
            case "head" => extracts.headOption
            case "last" => extracts.lastOption
            case "all" => Some(extracts.mkString)
            case other => throw new RuntimeException(s"无效的xpathScope[$other]，请检查配置文件:${sc.path}")
          }
          // A missing selection (no xpaths configured) is stored as an empty
          // string, the same value the "all" scope produces in that situation,
          // instead of dereferencing null.
          val value: String = selected.map(_.trim).getOrElse("")
          if (isBlank(value) && field.must) {
            logInfo(s"由于字段[name=${field.name},xpath=${field.xpaths.mkString(";")}]未抽取到数据,[url=$currentUrl]被跳过...")
            page.setSkip(true)
            onSkip(page)
          } else {
            page.putField(field.name, value)
          }
        }
      }
      if (!page.isSkip) {
        onSuccess(page)
      }
    } catch {
      case e: Exception =>
        // Record the cause before delegating; configuration mistakes such as
        // an invalid scope would otherwise vanish without a trace.
        val failedUrl: String = scala.util.Try(page.getRequest.getUrl).getOrElse("")
        logInfo(s"页面处理异常,[url=$failedUrl]: $e")
        onError(page)
    }
  }

  /**
    * Extracts the content selected by a single xpath.
    *
    * Not referenced anywhere else in this class.
    *
    * @param xpath      xpath expression to evaluate against the page
    * @param scope      "get" for the first match, "all" for every match concatenated
    * @param textFormat when true, markup is removed and only the text is returned
    * @param page       the page to extract from
    * @return the extracted content, or null when nothing non-blank was selected
    * @throws RuntimeException if the scope is neither "get" nor "all"
    */
  private def extractFieldContent(xpath:String,scope:String,textFormat:Boolean,page: Page):String = {
    val selectable: Selectable = page.getHtml.xpath(xpath)
    val selectText: String = scope match {
      case "get" => selectable.get()
      case "all" => selectable.all().asScala.mkString
      case exe => throw new RuntimeException(s"无效的scope:$exe,请检查配置文件:${sc.path}")
    }
    if (isBlank(selectText)) null
    else if (textFormat) Jsoup.parse(selectText).text()
    else selectText
  }

  /**
    * Drops the fragment part (everything from the first '#') of a URL.
    *
    * Unlike `split("#")(0)`, this also handles a string consisting solely of
    * '#' characters, for which `split` returns an empty array.
    *
    * @param url the URL to clean
    * @return the URL without its fragment
    */
  private def withoutFragment(url: String): String = url.takeWhile(_ != '#')

  /**
    * Tells whether a string is null, empty, or whitespace only.
    *
    * @param str the string to test
    * @return true when the string carries no visible content
    */
  def isBlank(str:String):Boolean = str == null || str.trim.isEmpty


}
